library(sp.scRNAseq)
library(ggplot2)
library(ggthemes)
library(plyr)
library(parallel)
library(pso)
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:sp.scRNAseq':
##
## counts, counts<-
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:plyr':
##
## rename
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:plyr':
##
## desc
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
library("BiocParallel")
load("~/Desktop/sp.scRNAseqHalfTime/halfTime.rda")
rm(cObjSng)
rm(cObjMul)
rm(sObj)
sng <- grepl("^s", colnames(expCounts))
cObjSng <- spCounts(expCounts[,sng], expErcc[,sng])
cObjMul <- spCounts(expCounts[,!sng], expErcc[,!sng])
#pick one multuplet. Look first at ERCC fractions for individual multuplets
sampleType <- c(
rep(
"Singlet",
ncol(getData(cObjSng, "counts"))
),
rep(
"Multuplet",
ncol(getData(cObjMul, "counts"))
)
)
counts <- cbind(
getData(cObjSng, "counts"),
getData(cObjMul, "counts")
)
counts.ercc <- cbind(
getData(cObjSng, "counts.ercc"),
getData(cObjMul, "counts.ercc")
)
frac.ercc <- colSums(counts.ercc) / (colSums(counts.ercc)+colSums(counts))
d <- data.frame(
sampleType = factor(sampleType, levels=c("Singlet", "Multuplet")),
frac.ercc=frac.ercc
)
ggplot(d, aes_string(x='sampleType', y='frac.ercc'))+
geom_jitter()+
labs(
x="Sample type",
y="Fraction of ERCC",
title="Fraction ERCC in singlets/doublets"
)+
theme_few()+
theme(
legend.position="top",
legend.title=element_blank(),
legend.text=element_text(size=15),
axis.title=element_text(size=17),
axis.text=element_text(size=15),
plot.title=element_text(
hjust=0.5,
family="Arial",
face="bold",
size=24,
margin=margin(b=15)
)
)+
guides(
colour=guide_legend(override.aes=list(size=5))
)

#calculate mean fraction of ERCC reads for Sng and Mul
ddply(d, "sampleType", summarize, mean=mean(frac.ercc))
## sampleType mean
## 1 Singlet 0.24687884
## 2 Multuplet 0.07582085
#find Mul closest to mean
x <- d[d$frac.ercc < 0.076 & d$frac.ercc > 0.072 & d$sampleType == "Multuplet", ]
x
## sampleType frac.ercc
## m.X1000102901.G10 Multuplet 0.07299422
## m.X1000102901.F3 Multuplet 0.07302992
## m.X1000102901.A5 Multuplet 0.07246207
## m.X1000102901.C5 Multuplet 0.07203692
#look at total reads for these Mul8
colSums(counts[,colnames(counts) %in% rownames(x)])
## m.X1000102901.G10 m.X1000102901.F3 m.X1000102901.A5 m.X1000102901.C5
## 538125 359809 642525 604593
#it appears as though Mul m.X1000102901.A5 has the highest count number, use it
name <- "m.X1000102901.A5"
Mul <- as.matrix(expCounts[,colnames(expCounts) == name])
MulERCC <- as.matrix(expErcc[,colnames(expErcc) == name])
colnames(Mul) <- name
colnames(MulERCC) <- name
cObjMul <- spCounts(Mul, MulERCC)
#note that the spCounts function does not work with only 1 multuplet i.e. counts.log and other slots are wrong, build the object by hand below
cObjMul@counts <- Mul
cObjMul@counts.ercc <- MulERCC
cObjMul@counts.log <- log2(Mul/colSums(Mul)*1000000+1)
cObjMul@counts.cpm <- Mul/colSums(Mul)*1000000+1
#plot markers and try to determine the cell types in the multuplet
spPlot(cObjSng, cObjMul, type="markers", markers=c("INS", "GCG"))

spPlot(cObjSng, cObjMul, type="markers", markers=c("SST", "THY1"))

spPlot(cObjSng, cObjMul, type="markers", markers=c("FLT1", "THY1"))

spPlot(cObjSng, cObjMul, type="markers", markers=c("PROM1", "THY1"))

spPlot(cObjSng, cObjMul, type="markers", markers=c("PRSS1", "THY1"))

spPlot(cObjSng, cObjMul, type="markers", markers=c("PTPRC", "THY1"))

spPlot(cObjSng, cObjMul, type="markers", markers=c("NEUROG3", "THY1"))

#preliminairly it seems that the multuplet is Flt1 and Thy1 positive indicating endotheliat and mesenchymal subtypes
#use the old uObj
#looks like spSwarm isn't working either with one multuplet. Perform step by step below
distFun = distToSlice
maxiter = 10
swarmsize = 150
cores=1
counts <- getData(cObjMul, "counts.cpm")
groupMeans <- getData(uObj, "groupMeans")
fractions <- rep(1.0/(dim(groupMeans)[2]), (dim(groupMeans)[2]))
selectInd <- getData(uObj, "selectInd")
cellTypes <- groupMeans[selectInd, ]
slice <- matrix(counts[selectInd, ], ncol=1)
colnames(slice) <- colnames(counts)
control=list(maxit=maxiter, s=swarmsize)
optim.fn <- function(i) {
oneslice <- slice[,i]
psoptim(
par=fractions,
fn=distFun,
cellTypes=cellTypes,
slice=oneslice,
lower=0,
upper=1,
control=control
)
}
distToSlice <- function(fractions, cellTypes, slice) {
if(sum(fractions) == 0) {
return(999999999)
}
normFractions <- fractions / sum(fractions)
a = .makeSyntheticSlice(cellTypes, normFractions)
sum(abs(a - slice)) # Abs dist
}
tmp <- mclapply(1:(dim(slice)[2]), optim.fn, mc.cores=cores)
output <- data.frame()
cost <- c()
counts <- tmp[[1]][[3]]
convergence <-ifelse(
tmp[[1]][[4]] == 1,
"Maximal number of function evaluations reached.",
ifelse(
tmp[[1]][[4]] == 2,
"Maximal number of iterations reached.",
ifelse(
tmp[[1]][[4]] == 3,
"Maximal number of restarts reached.",
"Maximal number of iterations without improvement reached."
)
)
)
for(i in 1:length(tmp)) {
curr <- tmp[[i]]
output <- rbind(output, as.data.frame(t(curr[[1]])))
cost <- c(cost, curr[[2]])
}
colnames(output) <- colnames(cellTypes)
rownames(output) <- colnames(slice)
#show the optimized vector
output
## A1 B1 C1 D1 E1 F1
## m.X1000102901.A5 0.03847916 0.2574927 0.0178159 0.001835303 0 0.04976419
## G1 H1 I1 J1 K1 L1
## m.X1000102901.A5 0.9899065 0 0.02711936 0.03888927 0.0002771224 0.1387781
## M1
## m.X1000102901.A5 0.04953655
#plot the uObj
spPlot(uObj, cObjSng, type="markers", markers=c("THY1", "FLT1"))

spPlot(uObj, type="clusters")

#So it appears as though the optimizaton has managed to find the correct result. Although this is difficult to tell since one of the cell types is mesenchymal which is represented by 5 classes in the unsupervised clustering
#check what spPoisson finds
edge.cutoff = 1/13.5
min.pval=1
min.num.edges=1
swarmT <- t(as.data.frame(output))
sobj.bool <- swarmT > edge.cutoff #nte that this finds class B1 to also be represented in the swarm results
nodes <- rownames(swarmT)
nclusts <- length(nodes)
edges <- data.frame(
from=rep(nodes, each=nclusts),
to=rep(nodes, nclusts),
weight=rep(0, nclusts^2)
)
for(i in 1:(dim(sobj.bool)[2])) {
o <- which(sobj.bool[,i])
if(length(o) > 1) {
for(j in 1:(length(o)-1)) {
for(k in (j+1):length(o)) {
ind1 <- (o[j]-1)*nclusts+o[k]
edges[ind1,3] <- edges[ind1,3]+1
ind2 <- (o[k]-1)*nclusts+o[j]
edges[ind2,3] <- edges[ind2,3]+1
}
}
}
}
mean.edges <- mean(edges$weight)
edges$pval <- ppois(edges$weight, mean.edges, lower.tail=FALSE)
out <- edges[edges$pval < min.pval & edges$weight >= min.num.edges, ]
out
## from to weight pval
## 20 B1 G1 1 0.0006155101
## 25 B1 L1 1 0.0006155101
## 80 G1 B1 1 0.0006155101
## 90 G1 L1 1 0.0006155101
## 145 L1 B1 1 0.0006155101
## 150 L1 G1 1 0.0006155101
#the results indicate one of two things:
#a) the multuplet is actually a triplet and that the cell types are mesenchymal (E1), endothelial (G1), and progenitor (B1)
#b) the progenitor connection is a flase positive indicating the importance of correctly setting the edge.cutoff paramater
#investigate class B1 genes to better ascertain if the multuplet contains a progenitor cell
fc <- groupMeans[,colnames(groupMeans) == "B1"] / rowMeans(groupMeans[,colnames(groupMeans) != "B1"])
fc <- sort(fc, decreasing=TRUE)
spPlot(cObjSng, cObjMul, type="markers", markers=names(fc)[1:2])

spPlot(cObjSng, cObjMul, type="markers", markers=names(fc)[3:4]) #ZNF600 looks to be present in the multuplet

spPlot(cObjSng, cObjMul, type="markers", markers=names(fc)[5:6])

spPlot(cObjSng, cObjMul, type="markers", markers=names(fc)[7:8])

spPlot(cObjSng, cObjMul, type="markers", markers=names(fc)[9:10])

spPlot(cObjSng, cObjMul, type="markers", markers=names(fc)[11:12]) #SNX11 present

spPlot(cObjSng, cObjMul, type="markers", markers=names(fc)[13:14])

spPlot(cObjSng, cObjMul, type="markers", markers=names(fc)[15:16])

spPlot(cObjSng, cObjMul, type="markers", markers=names(fc)[17:18])

spPlot(cObjSng, cObjMul, type="markers", markers=names(fc)[19:20])

#check the cell populations where the 2 discovered markers are expressed to judge how well they represent the B1 class
spPlot(uObj, cObjSng, type="markers", markers=c("ZNF600", "SNX11"))

#results are a bit hard to interpret. SNX11 is highly upregulated in 2 cells in B1 although is also present in on cell in G1.
#ZNF600 is highly upregulated in 1 cell in B1 but also present in serveal cells in other populations
#try running differential expression analysis to find B1 specific genes
countData <- getData(cObjSng, "counts")
colData <- data.frame(class=getData(uObj, "classification"))
#setup comparison
counts <- countData
colData$class <- factor(ifelse(colData$class %in% c("B1"), "B1", "others"), levels=c("B1", "others"))
dds <- DESeqDataSetFromMatrix(countData = counts, colData = colData, design = ~ class)
## converting counts to integer mode
hist(log2(rowMeans(counts(dds))), breaks=100)
abline(v=log2(0.15), col="red")

dds <- dds[ rowMeans(counts(dds)) >0.15, ]
dds$class <- relevel(dds$class, ref="others")
register(MulticoreParam(4))
dds <- DESeq(dds, fitType="local", parallel=TRUE)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 4 workers
## mean-dispersion relationship
## final dispersion estimates, MLE betas: 4 workers
## fitting model and testing: 4 workers
## -- replacing outliers and refitting for 11900 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
plotDispEsts(dds)

res <- results(dds)
res <- res[is.na(res$padj) == FALSE, ]
res <- res[order(res$log2FoldChange, decreasing=TRUE), ]
#there is only one significantly upregulated gene which is GHRL
#plot findings in the whole population
spPlot(uObj, cObjSng, type="markers", markers=rownames(res)[1:5])

spPlot(uObj, cObjSng, type="markers", markers=rownames(res)[6:10])

spPlot(uObj, cObjSng, type="markers", markers=rownames(res)[11:15])

spPlot(uObj, cObjSng, type="markers", markers=rownames(res)[16:20])

spPlot(cObjSng, cObjMul, type="markers", markers=rownames(res)[1:2])

spPlot(cObjSng, cObjMul, type="markers", markers=rownames(res)[3:4])

spPlot(cObjSng, cObjMul, type="markers", markers=rownames(res)[5:6])

spPlot(cObjSng, cObjMul, type="markers", markers=rownames(res)[7:8])

spPlot(cObjSng, cObjMul, type="markers", markers=rownames(res)[9:10])

spPlot(cObjSng, cObjMul, type="markers", markers=rownames(res)[11:12])

spPlot(cObjSng, cObjMul, type="markers", markers=rownames(res)[13:14]) #IDS is present in the Mul

spPlot(cObjSng, cObjMul, type="markers", markers=rownames(res)[15:16]) #BBS1 is present in the Mul

spPlot(cObjSng, cObjMul, type="markers", markers=rownames(res)[17:18])

spPlot(cObjSng, cObjMul, type="markers", markers=rownames(res)[19:20])

#replot and evaluate BBS1 and IDS
spPlot(uObj, type="clusters")

spPlot(uObj, cObjSng, type="markers", markers=c("BBS1", "IDS"))

#BBS1 seems isolated to the progenitor (B1) population in the single cells
#IDS has some expression within the mesenchymal subset although none in the E1 population
#this would seem to indicate that the optimization results are correct although it would be ideal to repeate this analysis with an additional multuplet